#Libraries

#The libraries necessary for the Project
library(e1071)
library(textstem)
library(udpipe)
library(sentimentr)
library(ggraph)
library(wordcloud)
library(wordcloud2)
library(tidygraph)
library(viridisLite)
library(visNetwork)
library(tidyverse)
library(textdata)
library(tidytext)
library(vosonSML)
library(igraph)
library(SentimentAnalysis)
library(SnowballC)
library(dplyr)
library(tm)
library(stringr)
library(ggplot2)
library(ggpmisc)
library(psych)

# Getting the Data ## YouTube API

# Location of the text file that stores the YouTube API key
file_path <- "API_Key.txt"

# Load the key (a single tab-separated value, no header row)
API <- read.table(file_path, header = FALSE, sep = "\t")

# Authenticate against the YouTube API and download the comment thread
youtubeAuth <- Authenticate("youtube", apiKey = API[1, 1])
videoID <- "https://www.youtube.com/watch?v=VGWGcESPltM"
youtubeData <- Collect(
  youtubeAuth,
  videoID,
  maxComments = 111000,
  verbose = FALSE,
  writeToFile = FALSE
)

rm(youtubeAuth, videoID)

## Saving the Original Youtube Data

write.csv(youtubeData, "YoutubeData.csv", row.names = TRUE)

## Reading the Original Youtube Data

youtubeData <- read.csv("YoutubeData.csv")

## Taking Care of Variable Types

# Coerce the count columns to numeric and make sure the comment text is
# UTF-8 encoded before any text processing
youtubeData$ReplyCount <- as.numeric(youtubeData$ReplyCount)
youtubeData$LikeCount <- as.numeric(youtubeData$LikeCount)
youtubeData$Comment <- enc2utf8(youtubeData$Comment)

## Removing Useless columns

# Identifier/URL columns that play no role in the analysis
columns_to_remove <- c("X", "AuthorProfileImageUrl", "AuthorChannelUrl",
                       "AuthorChannelID", "UpdatedAt", "VideoID")

# Keep every column that is not in the removal list
youtubeData <- youtubeData[, setdiff(names(youtubeData), columns_to_remove)]
colnames(youtubeData)
## [1] "Comment"           "AuthorDisplayName" "ReplyCount"       
## [4] "LikeCount"         "PublishedAt"       "CommentID"        
## [7] "ParentID"
rm(columns_to_remove)

# Descriptive Analysis ## Reply Count Distribution

# Raw histogram of the number of replies per comment
ggplot(youtubeData, aes(x = ReplyCount)) +
  geom_histogram(binwidth = 4, fill = "skyblue", color = "black", alpha = 0.7) +
  labs(title = "Distribution of ReplyCount",
       x = "ReplyCount",
       y = "Frequency")

# Same distribution with a log-scaled y axis: most comments receive zero
# replies, so the log scale makes the long tail visible
ggplot(youtubeData, aes(x = ReplyCount)) +
  geom_histogram(binwidth = 3, fill = "skyblue", color = "black", alpha = 0.7) +
  scale_y_log10() +
  labs(title = "Distribution of ReplyCount (log scale)",
       x = "ReplyCount",
       y = "Frequency (log scale)") +
  theme(axis.title = element_text(size = 14))

## LikeCount Distribution

# Likes are heavily right-skewed as well, hence the log-scaled x axis
ggplot(youtubeData, aes(x = LikeCount)) +
  geom_histogram(binwidth = .1, fill = "skyblue", color = "black", alpha = 0.7) +
  scale_x_log10() +
  labs(title = "Distribution of Like Count (log scale)",
       x = "LikeCount",
       y = "Frequency (log scale)") +
  theme(axis.title = element_text(size = 14))

## Numerical Variables Summary

columns_of_interest <- c("ReplyCount", "LikeCount")

# Print a five-number summary (plus mean) for each count column
for (column in columns_of_interest) {
  cat("Summary for column:", column, "\n")
  print(summary(youtubeData[[column]]))
  cat("\n")
}
## Summary for column: ReplyCount 
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   0.0000   0.0000   0.0000   0.2573   0.0000 485.0000 
## 
## Summary for column: LikeCount 
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##      0.00      0.00      0.00     11.23      1.00 117913.00
rm(columns_of_interest, column)

## Authors Frequency

# Count comments per author and keep the 20 most active authors
sorted_value_counts <- sort(table(youtubeData$AuthorDisplayName),
                            decreasing = TRUE) |>
  head(20)

# Build a plain (author, count) data frame. Passing the table straight to
# data.frame() would coerce it into Frequency.Var1/Frequency.Freq columns
# (the names duplicated into a redundant factor column), so the counts
# are coerced explicitly instead.
df <- data.frame(
  AuthorDisplayName = names(sorted_value_counts),
  Frequency = as.integer(sorted_value_counts)
)
rm(sorted_value_counts)

# Bar plot of the top-20 commenters, most frequent first
ggplot(df, aes(x = reorder(AuthorDisplayName, -Frequency), y = Frequency)) +
  geom_bar(stat = "identity", fill = "skyblue", color = "black") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Frequency of Author Display Names",
       x = "Author Display Name",
       y = "Frequency")

rm(df)

## Relationship Between Likes and Replies

# Hex-bin density of replies vs likes with a quadratic trend line; the
# fitted equation and adjusted R^2 are printed on the plot.
# after_stat() replaces the ..var.. notation, deprecated since ggplot2 3.4.
ggplot(youtubeData, aes(x = LikeCount, y = ReplyCount)) +
  geom_hex(bins = 20, color = "white") +
  scale_fill_gradient(low = "green", high = "red") +
  geom_smooth(method = lm, formula = y ~ poly(x, 2)) +
  stat_poly_eq(aes(label = paste(after_stat(eq.label),
                                 after_stat(adj.rr.label), sep = "~~~~")),
               formula = y ~ poly(x, 2), parse = TRUE) +
  labs(title = "Density plot of Reply Counts and  Comment Likes") +
  theme(axis.title = element_text(size = 14))

# Sentiment Analysis ## Cleaning the Text ### Remove NAs

# Drop rows whose comment text is missing
youtubeData <- filter(youtubeData, !is.na(Comment))

### Lower Case

# Normalise all comment text to lower case
youtubeData$Comment <- tolower(youtubeData$Comment)

### Word Frequency Before Further Cleaning

# Tokenise the raw comments to inspect word frequencies before removing
# any stop words
words <- tibble(line = seq_len(nrow(youtubeData)),
                text = youtubeData$Comment) |>
  unnest_tokens(word, text)
counts <- count(words, word, sort = TRUE)

# Bar chart of the 20 most frequent raw tokens
top20 <- slice_head(counts, n = 20)
ggplot(top20, aes(x = reorder(word, n), y = n)) +
  geom_col() +
  labs(y = NULL) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

rm(counts, top20)

### Word Cloud Before Further Cleaning

# Same information as the bar chart, shown as a word cloud of the
# 100 most frequent tokens
words |>
  count(word, sort = TRUE) |>
  head(100) |>
  wordcloud2(size = .8, shape = "triangle-forward",
             color = c("steelblue", "firebrick", "darkorchid"),
             backgroundColor = "white")
rm(words)

### Stop words

# Strip the standard English stop words from every comment
youtubeData$Comment <- removeWords(youtubeData$Comment, stopwords("en"))

### Custom Stop words

# Leftover fragments (contraction pieces, stray symbols) that the
# standard list does not cover
custom_stopwords <- c(
  "s", "t", "m", "re", "don", "doesn", "don", "ve",
  "didn", "u", "isn", "u", "p", "{", "}", "\\", "'"
)

youtubeData$Comment <- removeWords(youtubeData$Comment, custom_stopwords)
rm(custom_stopwords)

### Lemmatization

# Comments are full sentences, not single tokens. textstem's
# lemmatize_words() expects a vector where each element is ONE word, so
# applied to sentences it leaves them unchanged; lemmatize_strings()
# lemmatizes every word inside each string and is the correct call here.
youtubeData$Comment <- lemmatize_strings(youtubeData$Comment)

### Tokenization

# One row per (comment line, token) after all cleaning steps
SentimentData <- tibble(line = seq_len(nrow(youtubeData)),
                        text = youtubeData$Comment)
SentimentData_tokens <- unnest_tokens(SentimentData, word, text)
SentimentData_tokens %>% count(word, sort = TRUE)
## # A tibble: 46,931 × 2
##    word          n
##    <chr>     <int>
##  1 piers     50099
##  2 tate      34063
##  3 andrew    26253
##  4 like      15979
##  5 just      14200
##  6 man       12470
##  7 interview 11987
##  8 people    11147
##  9 morgan    11019
## 10 can       10373
## # ℹ 46,921 more rows

### Word Frequency After Further Cleaning

counts <- SentimentData_tokens %>% count(word, sort = TRUE)

# The 20 most frequent cleaned tokens
top20 <- counts %>% slice_head(n = 20)
print(top20)
## # A tibble: 20 × 2
##    word          n
##    <chr>     <int>
##  1 piers     50099
##  2 tate      34063
##  3 andrew    26253
##  4 like      15979
##  5 just      14200
##  6 man       12470
##  7 interview 11987
##  8 people    11147
##  9 morgan    11019
## 10 can       10373
## 11 women      8758
## 12 say        7471
## 13 authority  7252
## 14 think      6926
## 15 get        6901
## 16 men        6805
## 17 said       6733
## 18 one        6688
## 19 let        6277
## 20 woman      6017
# Bar chart of the top-20 cleaned tokens
ggplot(top20, aes(x = reorder(word, n), y = n)) +
  geom_col() +
  labs(y = NULL) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

rm(top20, counts)

### Word Cloud After Further Cleaning

# Word cloud of the 100 most frequent cleaned tokens
SentimentData_tokens %>%
  count(word, sort = TRUE) %>%
  head(100) %>%
  wordcloud2(size = 1, shape = "triangle-forward",
             color = c("steelblue", "firebrick", "darkorchid"),
             backgroundColor = "white")
rm(SentimentData, SentimentData_tokens)

## Words Graph ### Creating the Words Graph Dataset

# Label each comment by who it talks about: Piers Morgan or Andrew Tate.
# Comments mentioning BOTH are dropped as ambiguous (the original loop
# skipped them with `next`).
# The original row-by-row loop re-ran grepl() up to 11 times per comment;
# two vectorised regex matches express the same logic. The old pattern
# 'tate.' was redundant ('.' is a regex wildcard, so anything it matched
# was already matched by 'tate'), as was 'topg' (contains 'top').
mentions_piers <- grepl("piers|pierce|morgan", youtubeData$Comment,
                        ignore.case = TRUE)
mentions_tate <- grepl("andrew|tate|top| g ", youtubeData$Comment,
                       ignore.case = TRUE)

Result <- data.frame(
  Name = dplyr::case_when(
    mentions_piers & mentions_tate ~ NA_character_,  # ambiguous: skip
    mentions_tate ~ "Andrew Tate",
    mentions_piers ~ "Piers Morgan",
    TRUE ~ NA_character_                             # mentions neither
  ),
  Attribute = youtubeData$Comment
)

# Keep only comments that were assigned to exactly one person
Result <- Result[complete.cases(Result$Name), ]
rm(mentions_piers, mentions_tate)

### Cleaning the Words Graph Dataset

# Collapse runs of whitespace, then strip any leading space
Result$Attribute <- gsub("\\s+", " ", Result$Attribute)
Result$Attribute <- gsub("^\\s+", "", Result$Attribute)

# Explode each comment into one row per word, dropping empty tokens and
# the names themselves (they would dominate every count)
unwanted_values <- c("morgan", "tate", "top", "g", "piers", "andrew",
                     "topg", "pierce")
result_data <- Result %>%
  separate_rows(Attribute, sep = " ") %>%
  mutate(Attribute = str_trim(Attribute)) %>%
  na.omit() %>%
  filter(Attribute != "") %>%
  filter(!Attribute %in% unwanted_values)
rm(unwanted_values, Result)

# Frequency of each word per person; the result of summarise() is still
# grouped by Name, so top_n() keeps the 100 most frequent words PER person
result_data_final <- result_data %>%
  group_by(Name, Attribute) %>%
  summarise(Frequency = n()) %>%
  top_n(100, Frequency)
rm(result_data)
#Remove Useless words
stop_words <- c(
  "just", "can", "people", "just", "don", "say", "think", "said", "can", 
  "get", "let", "will", "one", "even", "way", "make", "things", "know", 
  "says", "doesn", "saying", "let", "one", "don", "trying", "get", "time", 
  "want", "now", "see", "trying", "say", "someone", "also", "go", "every", 
  "many", "got", "look", "thing", "even", "actaully", "got", "ve", 
  "someone", "make", "said", "still", "show", "every", "something", 
  "will", "didn", "lot", "way", "u", "going", "everything", "back", 
  "much", "word", "always", "give", "actually", "saying", "see", 
  "made", "keep", "whole", "video", "now", "isn", "go", "years", 
  "u", "us", "ever", "words", "dont", "really", "watch", "person", "point",
  "pierce", "tates","pierse","pierce.", "andrews","tate.","don"
)

unique_stop_words <- unique(stop_words)

# Drop rows whose word IS a stop word. The previous version used
# grepl(paste(stop_words, collapse = "|"), ...), which matched
# SUBSTRINGS: e.g. the stop word "u" removed every word containing the
# letter "u". An exact, case-insensitive membership test is what was
# intended (all stop words above are lower case).
result_data_filtered <- result_data_final %>%
  filter(!tolower(Attribute) %in% unique_stop_words)
rm(result_data_final, unique_stop_words, stop_words)

# Final data for the word graph: total frequency per (Name, word),
# dropping single-character tokens
result_df <- aggregate(Frequency ~ Name + Attribute,
                       data = result_data_filtered, sum)
result_df <- result_df %>%
  filter(nchar(Attribute) > 1)
rm(result_data_filtered)

### Plot of Words Graph

# Build an undirected graph linking each person to the words used about
# them. graph_from_data_frame() replaces the deprecated
# graph.data.frame().
list_network <- graph_from_data_frame(result_df, directed = FALSE)
plot(list_network, vertex.size = 10, vertex.label.cex = .8)

rm(result_df)
# Highlight very frequent person-word links in red
edge_colors <- ifelse(E(list_network)$Frequency > 1000, "red", "gray")

# The two people are blue, all word nodes green
node_colors <- ifelse(V(list_network)$name %in% c("Andrew Tate", "Piers Morgan"),
                      "blue", "green")

# Plotting the network with the customized colors
plot(
  list_network,
  vertex.size = 12,
  vertex.label.cex = 0.8,
  edge.color = edge_colors,
  vertex.color = node_colors
)

rm(list_network, node_colors, edge_colors)

## Creating Sentiment Analysis Data

# Sentence-level polarity with sentimentr, then averaged back to one
# score (and a total word count) per comment
sentences <- get_sentences(youtubeData$Comment)
sentiment_scores <- sentiment(sentences)

result <- sentiment_scores %>%
  group_by(element_id) %>%
  summarize(sentiment = mean(sentiment), word_count = sum(word_count))
rm(sentences, sentiment_scores)

# Attach the per-comment scores to the main data frame, then drop the
# helper id columns introduced by sentimentr
youtubeData <- cbind(youtubeData, result)
rm(result)
columns_to_remove <- c("element_id", "sentence_id")
youtubeData <- youtubeData[, !(colnames(youtubeData) %in% columns_to_remove)]
rm(columns_to_remove)

### Descriptive Analysis of Sentiment Scores

# Histogram of sentiment scores with a fitted normal curve overlaid.
# after_stat(density) replaces the ..density.. notation and linewidth
# replaces size for lines, both deprecated since ggplot2 3.4.
ggplot(youtubeData, aes(x = sentiment)) +
  geom_histogram(aes(y = after_stat(density)), binwidth = 0.05,
                 fill = "skyblue", color = "black", alpha = 0.7) +
  stat_function(fun = dnorm,
                args = list(mean = mean(youtubeData$sentiment),
                            sd = sd(youtubeData$sentiment)),
                color = "red", linewidth = 1) +
  labs(title = "Sentiment Histogram",
       x = "Sentiment",
       y = "Density") +
  theme_minimal() +
  theme(axis.title = element_text(size = 14)) +
  xlim(c(-1, 1))

# Skewness close to zero: the distribution is roughly symmetric
skewness_value <- skewness(youtubeData$sentiment)
print(skewness_value)
## [1] 0.05120366
rm(skewness_value)
positive_values <- sum(youtubeData$sentiment > 0)
negative_values <- sum(youtubeData$sentiment < 0)

# Share of strictly positive scores among all non-zero scores
percentage_positive <- (positive_values / (positive_values + negative_values)) * 100

cat("Percentage of positive values over negative values:", percentage_positive, "%\n")
## Percentage of positive values over negative values: 51.65105 %
rm(positive_values, negative_values, percentage_positive)

# Sentiment over time, with a linear trend line
ggplot(youtubeData, aes(x = PublishedAt, y = sentiment, color = sentiment)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(title = "Scatterplot of Sentiment by Date") +
  scale_color_gradient(low = "red", high = "green")

# Hex-bin density of sentiment against likes, quadratic trend
ggplot(youtubeData, aes(x = LikeCount, y = sentiment)) +
  geom_hex(bins = 20, color = "white") +
  scale_fill_gradient(low = "green", high = "red") +
  geom_smooth(method = lm, formula = y ~ poly(x, 2)) +
  labs(title = "Density plot of Sentiment in Youtube Video by Comment Likes")

# Spearman (rank) correlation: robust to the heavy right tail of LikeCount
correlation_value <- cor(youtubeData$sentiment, youtubeData$LikeCount,
                         method = "spearman")
cat("Correlation between LikeCount and the sentiment score:", correlation_value, "\n")
## Correlation between LikeCount and the sentiment score: 0.07446133
rm(correlation_value)
# Hex-bin density of sentiment against reply counts, quadratic trend
ggplot(youtubeData, aes(x = ReplyCount, y = sentiment)) +
  geom_hex(bins = 20, color = "white") +
  scale_fill_gradient(low = "green", high = "red") +
  geom_smooth(method = lm, formula = y ~ poly(x, 2)) +
  labs(title = "Density plot of Sentiment in Youtube Video by Comment Replies")

# Spearman correlation between sentiment and replies.
# The printed label previously said "LikeCount" — a copy-paste slip; this
# correlation is computed against ReplyCount.
correlation_value <- cor(youtubeData$sentiment, youtubeData$ReplyCount,
                         method = "spearman")
cat("Correlation between ReplyCount and the sentiment score:", correlation_value, "\n")
## Correlation between ReplyCount and the sentiment score: 0.03431116
rm(correlation_value)

# Mean and median of the sentiment scores
mean(youtubeData$sentiment, na.rm = TRUE)
## [1] 0.008401548
median(youtubeData$sentiment, na.rm = TRUE)
## [1] 0

## Sentiment Analysis on all the comments

# Re-tokenise the cleaned comments into one long (line, word) table
all_comments <- tibble(line = seq_len(nrow(youtubeData)),
                       text = youtubeData$Comment) %>%
  unnest_tokens(word, text)

# Overall word frequencies
all_comments %>% count(word, sort = TRUE)
## # A tibble: 46,931 × 2
##    word          n
##    <chr>     <int>
##  1 piers     50099
##  2 tate      34063
##  3 andrew    26253
##  4 like      15979
##  5 just      14200
##  6 man       12470
##  7 interview 11987
##  8 people    11147
##  9 morgan    11019
## 10 can       10373
## # ℹ 46,921 more rows
# Tag each token with NRC lexicon emotions
sentiments <- get_sentiments("nrc")
sentiments %>% count(sentiment, sort = TRUE)
## # A tibble: 10 × 2
##    sentiment        n
##    <chr>        <int>
##  1 negative      3316
##  2 positive      2308
##  3 fear          1474
##  4 anger         1245
##  5 trust         1230
##  6 sadness       1187
##  7 disgust       1056
##  8 anticipation   837
##  9 joy            687
## 10 surprise       532
# Join tokens to the lexicon. The join key is stated explicitly (the
# implicit-key message was noise); the NRC lexicon maps one word to
# several sentiments, so the join legitimately duplicates token rows.
all_comments_sentiment <- all_comments %>% inner_join(sentiments, by = "word")
rm(sentiments, all_comments)
all_comments_sentiment %>% count(sentiment, sort = TRUE)
## # A tibble: 10 × 2
##    sentiment         n
##    <chr>         <int>
##  1 positive     139938
##  2 negative     111924
##  3 trust         91315
##  4 anger         57574
##  5 anticipation  56019
##  6 fear          55769
##  7 sadness       50091
##  8 joy           43772
##  9 disgust       39005
## 10 surprise      23132
# Color mapping for each sentiment: green shades for positive emotions,
# red shades for negative ones
color_mapping <- c(
  "positive" = "green",
  "trust" = "#2EB62C",
  "anticipation" = "#83D475",
  "joy" = "#ABE098",
  "surprise" = "#C5E8B7",
  "disgust" = "#EA4C46",
  "negative" = "red",
  "fear" = "#F6BDC0",
  "sadness" = "#F1959B",
  "anger" = "#F07470"
)

# Histogram of emotion frequencies, most frequent first.
# guide = "none" replaces guide = FALSE, deprecated since ggplot2 3.3.4.
all_comments_sentiment %>%
  count(sentiment) %>%
  ggplot(aes(reorder(sentiment, -n), n, fill = sentiment)) +
  geom_col(color = "black") +
  scale_fill_manual(values = color_mapping, guide = "none") +
  labs(title = "Distribution of Different Emotions",
       x = "Sentiments",
       y = "Frequency")

rm(color_mapping)
# One column per sentiment, word counts as values — used to drive the
# sentiment-specific word clouds below
all_comments_sentiment_pivot <- all_comments_sentiment %>%
  count(word, sentiment, sort = TRUE) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0)

# Word cloud of negative words
all_comments_sentiment_pivot %>%
  with(wordcloud(word, negative, min.freq = 200, random.order = FALSE,
                 colors = brewer.pal(8, "PuOr")))

# Word cloud of positive words
all_comments_sentiment_pivot %>%
  with(wordcloud(word, positive, min.freq = 200, random.order = FALSE,
                 colors = brewer.pal(8, "PuOr")))

rm(all_comments_sentiment, all_comments_sentiment_pivot)

# Network Graph ## Make the Data less voluminous

# Work on a copy; shorten CommentID to the part after the final dot
youtubeData_igraph <- youtubeData
youtubeData_igraph$CommentID <- sub(".*\\.(.*)", "\\1", youtubeData_igraph$CommentID)
youtubeData_igraph <- youtubeData_igraph[order(-youtubeData_igraph$ReplyCount), ]

# The 14 most-replied-to comments
top_comments <- head(youtubeData_igraph$CommentID, 14)

# Drop parent comments with fewer than 100 replies
youtubeData_igraph <- youtubeData_igraph[!(youtubeData_igraph$ReplyCount < 100 & is.na(youtubeData_igraph$ParentID)), ]

# Keep replies only when their parent is one of the top comments
youtubeData_igraph <- youtubeData_igraph[youtubeData_igraph$ParentID %in% top_comments | is.na(youtubeData_igraph$ParentID), ]
rm(top_comments)

## igraph ### Creating The Nodes / Edges

# Classify each comment row: rows with a non-missing ParentID (column 7)
# are replies, the rest are parent comments. A vectorised ifelse()
# replaces the original append()-in-a-loop, which re-copied the growing
# vector on every iteration (quadratic cost).
type_list <- ifelse(is.na(youtubeData_igraph[[7]]), "Parent", "Reply")

### Creating the igraph object

library(igraph)
# Start from an empty directed graph and add one vertex per comment row
# in a single call: make_empty_graph() and vertices() replace the
# deprecated graph.empty() and the one-vertex-at-a-time loop (which
# rebuilt the graph on every iteration). Duplicate author names still
# produce duplicate vertices, exactly as the loop did.
g <- make_empty_graph(directed = TRUE)
g <- g + vertices(youtubeData_igraph$AuthorDisplayName)
# Add edges with weights.
# For every reply row (non-missing ParentID in column 7), draw a directed
# edge from the reply's author (column 2) to the author of each row whose
# CommentID equals that ParentID, weighted by the reply's LikeCount
# (column 4). Column positions assume the layout left by the earlier
# cleaning steps: Comment, AuthorDisplayName, ReplyCount, LikeCount,
# PublishedAt, CommentID, ParentID, ... — TODO confirm if columns change.
# NOTE(review): edges are attached by vertex NAME; when an author name
# occurs on several vertices, igraph presumably resolves to the first
# match — verify this is the intended behaviour.
num_rows <- nrow(youtubeData_igraph)
for (i in 1:num_rows) {
  if (!is.na(youtubeData_igraph[i, 7])) {
    # All rows holding the parent comment of reply i
    matching_rows <- which(youtubeData_igraph$CommentID == youtubeData_igraph[i, 7])
    for (j in matching_rows) {
      g <- g + edge(youtubeData_igraph[i, 2], youtubeData_igraph[j, 2], weight = youtubeData_igraph[i,4])
    }
  }
}
# NOTE(review): rm(j)/rm(matching_rows) would error if no row had a
# ParentID (the inner loop never ran) — confirm the data always contains
# at least one reply.
rm(i)
rm(j)
rm(matching_rows)
rm(num_rows)

### Plots of igraph object

# Print the graph summary, then draw a basic plot
g
## IGRAPH c6c3e92 DNW- 295 280 -- 
## + attr: name (v/c), weight (e/n)
## + edges from c6c3e92 (vertex names):
##  [1] @Baron_von_Fargone->@jamesedginton9822
##  [2] @williamking5063  ->@jamesedginton9822
##  [3] @sylviabriggs4087 ->@jamesedginton9822
##  [4] @happydaysx335    ->@jamesedginton9822
##  [5] @oasis808         ->@jamesedginton9822
##  [6] @rochanahirun5490 ->@jamesedginton9822
##  [7] @microsoftuser5415->@jamesedginton9822
##  [8] @_Mango609        ->@jamesedginton9822
## + ... omitted several edges
plot.igraph(g, vertex.label = NA, vertex.size = 6)

# Automatic layout, edges coloured by weight
plot(g, layout = layout_nicely(g), vertex.label = NA, vertex.size = 8,
     edge.color = E(g)$weight)

# Circle layout
l <- layout_in_circle(g)
plot(g, layout = l, vertex.label = NA, vertex.size = 5,
     edge.color = E(g)$weight)

# Sphere layout
l <- layout_on_sphere(g)
plot(g, layout = l, vertex.size = 7, vertex.label = NA)

# Heatmap of the adjacency (weight) matrix.
# as_adjacency_matrix() replaces the deprecated get.adjacency().
# The vertices of this graph carry a "name" attribute, not "media"
# (V(g)$media came from the tutorial this snippet was adapted from and is
# NULL here, which silently erased the row/column labels) — label with
# V(g)$name instead.
netm <- as_adjacency_matrix(g, attr = "weight", sparse = FALSE)
colnames(netm) <- V(g)$name
rownames(netm) <- V(g)$name

palf <- colorRampPalette(c("gold", "dark orange"))
# Only the first 17 columns are shown (reversed) to keep the heatmap legible
heatmap(netm[, 17:1], Rowv = NA, Colv = NA, col = palf(20),
        scale = "none", margins = c(10, 10))

rm(netm, palf)
# Parent comments become red circles, replies green squares
colors <- ifelse(type_list == "Parent", "red", "green")
types <- ifelse(type_list == "Parent", "circle", "square")

V(g)$color <- colors
V(g)$shape <- types
rm(types, colors)

# Re-plot with the new vertex attributes
g
## IGRAPH c6c3e92 DNW- 295 280 -- 
## + attr: name (v/c), color (v/c), shape (v/c), weight (e/n)
## + edges from c6c3e92 (vertex names):
##  [1] @Baron_von_Fargone->@jamesedginton9822
##  [2] @williamking5063  ->@jamesedginton9822
##  [3] @sylviabriggs4087 ->@jamesedginton9822
##  [4] @happydaysx335    ->@jamesedginton9822
##  [5] @oasis808         ->@jamesedginton9822
##  [6] @rochanahirun5490 ->@jamesedginton9822
##  [7] @microsoftuser5415->@jamesedginton9822
##  [8] @_Mango609        ->@jamesedginton9822
## + ... omitted several edges
plot.igraph(g, vertex.label = NA, vertex.size = 6)

# Plot with "layout_nicely"
plot(g, layout = layout_nicely(g), vertex.label = NA, vertex.size = 8,
     edge.color = E(g)$weight)
## Warning in layout_nicely(g): Non-positive edge weight found, ignoring all
## weights during graph layout.

# Circle layout
l <- layout_in_circle(g)
plot(g, layout = l, vertex.label = NA, vertex.size = 5,
     edge.color = E(g)$weight)

# Sphere layout
l <- layout_on_sphere(g)
plot(g, layout = l, vertex.size = 7, vertex.label = NA)

rm(l)

### Attributes of the Graph

# Mean edge weight
mean(edge_attr(g)$weight)
## [1] 97.63929
# Diameter, ignoring edge direction
diameter(g, directed = FALSE)
## [1] 5190
diam <- get_diameter(g, directed = FALSE)
diam
## + 3/295 vertices, named, from c6c3e92:
## [1] @calebgao6931 @waffilushus  @dvandeun
rm(diam)

# Degree of every vertex (in + out)
deg <- degree(g, mode = "all")

# The ten highest-degree vertices
sorted_vertices <- order(deg, decreasing = TRUE)
top_10_vertices <- head(sorted_vertices, 10)
result <- data.frame(Vertex = top_10_vertices, Degree = deg[top_10_vertices])
print(result)
##                       Vertex Degree
## @pushinp2300               3     23
## @jeffcajou76               7     21
## @waffilushus               1     20
## @CharLie_69                2     20
## @Jasejamaica               4     20
## @AlasdairMacgillivray      5     20
## @Aerish369                 6     20
## @justforfunxxD             8     20
## @moe17                     9     20
## @Blurhornet               10     20
rm(result, sorted_vertices, top_10_vertices)

# Mean and median degree
mean_degree <- mean(deg)
median_degree <- median(deg)

print(paste("Mean Degree:", mean_degree))
## [1] "Mean Degree: 1.89830508474576"
print(paste("Median Degree:", median_degree))
## [1] "Median Degree: 1"
rm(mean_degree, median_degree)
# Degree Distribution.
# degree_distribution() returns the RELATIVE FREQUENCY of each degree
# 0, 1, 2, ... The previous code ran table() over those frequency values,
# so the x axis showed probabilities instead of degrees. Build the
# (degree, frequency) pairs directly.
degree_dist <- degree_distribution(g)
degree_df <- data.frame(degree = seq_along(degree_dist) - 1,
                        Freq = degree_dist)

# Plot the degree distribution with ggplot2
ggplot(degree_df, aes(x = degree, y = Freq)) +
  geom_bar(stat = "identity", fill = "skyblue", color = "black") +
  labs(title = "Degree Distribution",
       x = "Degree",
       y = "Relative Frequency") +
  theme_minimal()

rm(degree_df, degree_dist, deg)
# Average path length: the mean shortest-path distance over all pairs of
# nodes, ignoring edge direction
mean_distance(g, directed = FALSE)
## [1] 315.239
# A few of the edges
head(as_edgelist(g, names = TRUE))
##      [,1]                 [,2]                
## [1,] "@Baron_von_Fargone" "@jamesedginton9822"
## [2,] "@williamking5063"   "@jamesedginton9822"
## [3,] "@sylviabriggs4087"  "@jamesedginton9822"
## [4,] "@happydaysx335"     "@jamesedginton9822"
## [5,] "@oasis808"          "@jamesedginton9822"
## [6,] "@rochanahirun5490"  "@jamesedginton9822"
# Edge density (no self-loops counted)
edge_density(g, loops = FALSE)
## [1] 0.00322841
# Reciprocity: no mutual reply pairs in this network
reciprocity(g)
## [1] 0

### Closeness

# Closeness centrality for the first ten vertices (unweighted)
head(closeness(g, mode = "all", weights = NA), 10)
##          @waffilushus           @CharLie_69          @pushinp2300 
##           0.050000000           0.052631579           0.006578947 
##          @Jasejamaica @AlasdairMacgillivray            @Aerish369 
##           0.055555556           0.014285714           0.100000000 
##          @jeffcajou76        @justforfunxxD                @moe17 
##           0.058823529           0.008620690           0.006410256 
##           @Blurhornet 
##           0.050000000
# Convert the igraph object to tidygraph for the ggraph plot below
g <- as_tbl_graph(g)
g
## # A tbl_graph: 295 nodes and 280 edges
## #
## # A directed multigraph with 52 components
## #
## # A tibble: 295 × 3
##   name                  color shape 
##   <chr>                 <chr> <chr> 
## 1 @waffilushus          red   circle
## 2 @CharLie_69           red   circle
## 3 @pushinp2300          red   circle
## 4 @Jasejamaica          red   circle
## 5 @AlasdairMacgillivray red   circle
## 6 @Aerish369            red   circle
## # ℹ 289 more rows
## #
## # A tibble: 280 × 3
##    from    to weight
##   <int> <int>  <dbl>
## 1    16    13      1
## 2    17    13     10
## 3    18    13     80
## # ℹ 277 more rows
# Quick structural plot of the network
g %>%
  activate(nodes) %>%
  ggraph(layout = "stress") +
  geom_edge_fan(width = .8, color = "lightblue") +
  coord_fixed() +
  theme_graph()

# Rebuild the reduced data set from the original data, this time with a
# wider selection: top 64 comments and a 40-reply threshold
youtubeData_igraph <- youtubeData
youtubeData_igraph$CommentID <- sub(".*\\.(.*)", "\\1", youtubeData_igraph$CommentID)
youtubeData_igraph <- youtubeData_igraph[order(-youtubeData_igraph$ReplyCount), ]

# The 64 most-replied-to comments
top_comments <- head(youtubeData_igraph$CommentID, 64)

# Drop parent comments with fewer than 40 replies
youtubeData_igraph <- youtubeData_igraph[!(youtubeData_igraph$ReplyCount < 40 & is.na(youtubeData_igraph$ParentID)), ]

# Keep replies only when their parent is one of the top comments
youtubeData_igraph <- youtubeData_igraph[youtubeData_igraph$ParentID %in% top_comments | is.na(youtubeData_igraph$ParentID), ]
rm(top_comments)
# Creating the dataset for the VisNetwork graph.
# Replies (non-missing ParentID, column 7) produce one edge from the
# reply's author (column 2) to the author of every row holding the parent
# comment, weighted by the reply's LikeCount (column 4); parent comments
# produce a self-loop weighted by their ReplyCount (column 3).
# Edges are collected in preallocated per-row lists and flattened once at
# the end; the original append()-per-iteration pattern re-copied the
# growing vectors on every pass (quadratic cost).
num_rows <- nrow(youtubeData_igraph)
from_acc <- vector("list", num_rows)
to_acc <- vector("list", num_rows)
weight_acc <- vector("list", num_rows)

for (i in seq_len(num_rows)) {
  if (!is.na(youtubeData_igraph[i, 7])) {
    # Rows holding the parent comment of reply i (possibly several)
    matching_rows <- which(youtubeData_igraph$CommentID == youtubeData_igraph[i, 7])
    if (length(matching_rows) > 0) {
      from_acc[[i]] <- rep(youtubeData_igraph[i, 2], length(matching_rows))
      to_acc[[i]] <- youtubeData_igraph[matching_rows, 2]
      weight_acc[[i]] <- rep(youtubeData_igraph[i, 4], length(matching_rows))
    }
  } else {
    # Parent comment: self-loop weighted by its reply count
    from_acc[[i]] <- youtubeData_igraph[i, 2]
    to_acc[[i]] <- youtubeData_igraph[i, 2]
    weight_acc[[i]] <- youtubeData_igraph[i, 3]
  }
}

# Flatten to the plain vectors the downstream visNetwork code expects
from_list <- unlist(from_acc)
to_list <- unlist(to_acc)
weight_list <- unlist(weight_acc)

rm(from_acc, to_acc, weight_acc, num_rows, i)